In [53]:
%matplotlib inline
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from MasterSeer import MasterSeer
from sklearn.feature_selection import SelectPercentile, f_classif, SelectFromModel
from sklearn.linear_model import LinearRegression
from lifelines.plotting import plot_lifetimes
from lifelines import KaplanMeierFitter
from numpy.random import uniform, exponential
from pandas.tools.plotting import scatter_matrix, radviz, parallel_coordinates
In [57]:
# Column names applied to clean1.csv; they replace the file's own header row
# (header=0 tells read_csv to discard that row in favour of `names`).
FEATURES = [
    "Birth Year",
    "Age at Diagnosis",
    "Race",
    "Origin",
    "laterality",
    "Radiation",
    "Histrec",
    "ER Status",
    "PR Status",
    "Behanal",
    "Stage",
    "Numprimes",
    "Survival Time",
    "Bucket",
]

# Display text for each numeric survival bucket code.
# The middle label previously read "60 < months > 120", which is not a valid
# range expression; it now names the interval between the two other buckets.
LABEL_MAP = {
    0: "< 60 Months",
    1: "60 to 120 months",
    2: "> 120 months",
}

# Read the data into a DataFrame
df = pd.read_csv("clean1.csv", sep=',', header=0, names=FEATURES)

# Convert class labels into text.
# `.ix` was removed from pandas; a single dict-based replace swaps every code
# listed in LABEL_MAP for its label and leaves any other value untouched,
# which is what the former per-code masked assignment did.
df['Bucket'] = df['Bucket'].replace(LABEL_MAP)

print(df.head(n=5))
df.describe()
Out[57]:
In [58]:
# Number of rows in each survival bucket.
print(df.groupby(by='Bucket')['Bucket'].count())
In [59]:
# Histogram of the 'Survival Time' column over ten equal-width bins that span
# the observed minimum and maximum.
fig = plt.figure()
ax = fig.add_subplot(111)
survival_time = df['Survival Time']
ax.hist(survival_time, bins=10, range=(survival_time.min(), survival_time.max()))
ax.set_title('Survival Time Distribution')
# The x-axis carries the survival time; the y-axis is the number of rows per
# bin, so it must not be labelled with a time unit (it used to say 'Months').
# NOTE(review): the unit is taken to be months, matching the bucket labels — confirm.
ax.set_xlabel('Survival Time (Months)')
ax.set_ylabel('Number of Patients')
plt.show()
In [60]:
# Scatter-plot matrix of df on a 12x12 figure, with kernel-density estimates
# on the diagonal and mostly transparent markers.
scatter_matrix(
    frame=df,
    figsize=(12, 12),
    diagonal='kde',
    alpha=0.2,
)
plt.show()
In [61]:
# Parallel-coordinates plot of df, with lines grouped by the 'Bucket' column,
# drawn on a fresh 12x12 figure.
plt.figure(figsize=(12, 12))
parallel_coordinates(frame=df, class_column='Bucket')
plt.show()
In [62]:
# RadViz projection of df, with points grouped by the 'Bucket' column,
# drawn on a fresh 12x12 figure.
plt.figure(figsize=(12, 12))
radviz(frame=df, class_column='Bucket')
plt.show()
In [63]:
class ExploreSeer(MasterSeer):
    """Survival plots over data loaded through the MasterSeer base class.

    The constructor normalises the data path, initialises the base class and
    opens the database via ``init_database``; ``plot_survival`` then loads a
    sample and draws Kaplan-Meier curves and survival-time histograms.
    """

    # Columns requested from load_data for the survival plots.
    _SURVIVAL_COLUMNS = ['YR_BRTH', 'AGE_DX', 'LATERAL', 'RADIATN', 'HISTREC', 'ERSTATUS',
                         'PRSTATUS', 'BEHANAL', 'HST_STGA', 'NUMPRIMS', 'SRV_TIME_MON',
                         'SRV_TIME_MON_PA', 'DTH_CLASS', 'O_DTH_CLASS', 'STAT_REC']

    # Row filter passed to load_data as its `cond` argument.
    _SURVIVAL_FILTER = ('SRV_TIME_MON < 1000 AND HST_STGA < 8 AND DTH_CLASS < 9 '
                        'AND ERSTATUS < 4 AND PRSTATUS < 4')

    def __init__(self, path=r'./data/', testMode=False, verbose=True, sample_size=5000):
        """Store the options, normalise ``path`` and open the database.

        Args:
            path: data directory; a trailing '/' is appended when missing.
            testMode: stored on the instance (original note: import one file,
                500 records and return).
            verbose: stored on the instance and forwarded to the base class
                (original note: prints status messages).
            sample_size: stored on the instance (original note: number of rows
                to pull for testing).

        Raises:
            TypeError: if ``path`` is not a string.
            ValueError: if ``path`` is empty.
        """
        # user supplied parameters
        self.testMode = testMode
        self.verbose = verbose
        self.sample_size = sample_size

        if not isinstance(path, str):
            raise TypeError('path must be a string')
        if not path:
            # Indexing path[-1] on an empty string used to raise IndexError;
            # report the actual problem instead.
            raise ValueError('path must not be empty')
        if not path.endswith('/'):
            path += '/'  # make sure the path ends with a forward slash
        self.path = path

        # open connection to the database
        super().__init__(path, False, verbose=verbose)
        self.db_conn, self.db_cur = super().init_database(False)

    def __del__(self):
        """Delegate clean-up to the base class."""
        super().__del__()

    def plot_survival(self, sample_size=100000, show_lifetimes=False):
        """Load a sample and draw the survival plots.

        Args:
            sample_size: value forwarded to ``load_data``. Defaults to 100000,
                the value that used to be hard-coded here.
            show_lifetimes: when True, also draw the per-patient lifetime
                chart. That chart's code previously sat after an early
                ``return`` and could never run, so it stays off by default.

        Returns:
            None. The figures are shown with ``plt.show()``.
        """
        df = self._load_survival_frame(sample_size)
        self._plot_kaplan_meier(df)
        self._plot_survival_histograms(df)
        if show_lifetimes:
            self._plot_lifetimes(df)

    def _load_survival_frame(self, sample_size):
        """Load the survival columns and recode them for plotting."""
        df = super().load_data(col=self._SURVIVAL_COLUMNS,
                               cond=self._SURVIVAL_FILTER,
                               sample_size=sample_size)

        # Fold radiation code 7 into 0, then drop every remaining code >= 7.
        # The former try/except-pass around these lines hid nothing useful:
        # any failure here would resurface on the next use of RADIATN.
        df = df.copy()
        df['RADIATN'] = df['RADIATN'].replace(7, 0)
        df = df[df['RADIATN'] < 7]

        df = self._recode_receptor_status(df, 'ERSTATUS')
        df = self._recode_receptor_status(df, 'PRSTATUS')

        df['SRV_TIME_YR'] = df['SRV_TIME_MON'] / 12
        return df

    @staticmethod
    def _recode_receptor_status(df, column):
        """Drop codes 4 and 9 from ``column`` and recode it to 0-negative, 1-borderline, 2-positive."""
        codes = df[column]
        # .copy() so the assignment below does not write into a slice of the
        # caller's frame (avoids SettingWithCopyWarning).
        df = df[(codes != 4) & (codes != 9)].copy()

        status = df[column]
        # The order matters: each step vacates the code the next one writes to
        # (2 -> 0 first, then 1 -> 2, then 3 -> 1).
        status = status.replace(2, 0)
        status = status.replace(1, 2)
        status = status.replace(3, 1)
        df[column] = status
        return df

    def _plot_kaplan_meier(self, df):
        """Draw one Kaplan-Meier panel per grouping (radiation, ER, PR, stage, age)."""
        durations = df['SRV_TIME_YR']
        # STAT_REC == 4 is passed as the observed event; every other value is
        # treated as censored.
        observed = df.STAT_REC == 4

        rad = df.RADIATN > 0
        # After recoding, > 0 covers both borderline (1) and positive (2).
        er = df.ERSTATUS > 0
        pr = df.PRSTATUS > 0
        age = df.AGE_DX < 50

        # One list of (row mask, legend label) pairs per subplot, top to bottom.
        panels = [
            [(rad, "Radiation"), (~rad, "No Radiation")],
            [(er, "ER Positive"), (~er, "ER Negative")],
            [(pr, "PR Positive"), (~pr, "PR Negative")],
            [(df.HST_STGA == 0, "Stage 0"), (df.HST_STGA == 1, "Stage 1"),
             (df.HST_STGA == 2, "Stage 2"), (df.HST_STGA == 4, "Stage 4")],
            [(age, "Age < 50"), (~age, "Age >= 50")],
        ]

        f, ax = plt.subplots(len(panels), sharex=True, sharey=True)
        ax[0].set_title("Lifespans of cancer patients")

        kmf = KaplanMeierFitter()
        for axis, groups in zip(ax, panels):
            for mask, label in groups:
                if not mask.any():
                    # Nothing to fit for a group with no rows in this sample.
                    continue
                kmf.fit(durations[mask], event_observed=observed[mask], label=label)
                kmf.plot(ax=axis)
            axis.legend(loc=3, prop={'size': 10})

        ax[-1].set_xlabel('Survival in years')
        f.text(0.04, 0.5, 'Survival %', va='center', rotation='vertical')
        plt.tight_layout()
        # The y-axis is shared, so setting the limits on one panel sets them on all.
        ax[-1].set_ylim(0, 1)
        plt.show()

    def _plot_survival_histograms(self, df):
        """Draw survival-time histograms split by whether STAT_REC equals 4."""
        f, ax = plt.subplots(2, sharex=True, sharey=True)
        # `by` groups on a boolean: False (STAT_REC == 4) goes to the first
        # axis, True (STAT_REC != 4) to the second.
        df.hist('SRV_TIME_YR', by=df.STAT_REC != 4, ax=(ax[0], ax[1]))
        ax[0].set_title('Histogram of Non Censored Patients')
        ax[0].set_ylabel('Number of Patients')
        ax[1].set_ylabel('Number of Patients')
        ax[1].set_title('Histogram of Censored Patients')
        ax[1].set_xlabel('Survival in Years')
        plt.show()

    def _plot_lifetimes(self, df):
        """Draw one horizontal line per row: uncensored in blue, censored in red."""
        fig, ax = plt.subplots(figsize=(8, 6))
        cen = df[df.STAT_REC != 4].SRV_TIME_MON.sort_values()
        nc = df[df.STAT_REC == 4].SRV_TIME_MON.sort_values()

        # Uncensored rows take the first y positions; censored rows are stacked above.
        ax.hlines(list(range(len(nc))), 0, nc.values, color='b', label='Uncensored')
        ax.hlines(list(range(len(nc), len(nc) + len(cen))), 0, cen.values,
                  color='r', label='Censored')
        ax.set_xlim(left=0)
        ax.set_xlabel('Months')
        ax.set_ylim(-0.25, len(df) + 0.25)
        ax.legend(loc='best')
        plt.show()
In [64]:
# Build the explorer with the default data path and draw the survival plots.
# NOTE(review): sample_size given here is only stored on the instance;
# plot_survival passes its own sample_size to load_data — confirm which is intended.
seer = ExploreSeer(sample_size=10000)
seer.plot_survival()
In [ ]: